Die MNIST-Datenbank (Modified National Institute of Standards and Technology database) ist eine öffentlich verfügbare Datenbank von handgeschriebenen Ziffern. Sie besteht aus 60.000 Beispielen (28x28-Graustufenbilder) im Trainingsdatensatz und 10.000 Beispielen im Testdatensatz.

Wir werden ein einfaches neuronales Netzwerk mit 2 hidden Schichten (50 bzw. 500 Knoten) bilden und uns die Loss-Landschaft für 2 Gewichte im letzten Layer anschauen. Durch die Überwachung dieser 2 Gewichte lässt sich der qualitative Unterschied zwischen den Algorithmen zeigen.

Lassen Sie uns anfangen. Wir importieren alle benötigten Module und bereiten die Daten vor.
import numpy as np
from keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder
from scipy.special import expit
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
import plotly.offline
plotly.offline.init_notebook_mode(connected=True)
Die Eingabedaten sind 60.000 Graustufenbilder der Größe 28x28, d.h. die X-Werte sind Zahlen zwischen 0 und 255. Für die Normalisierung reicht es, sie durch 255 zu teilen. Wir werden sie außerdem aus der 28x28-Matrix zu einem 1x784-Vektor strecken.
# Prepare the MNIST training data: load, flatten, normalize, one-hot encode.
oh = OneHotEncoder(categories='auto')  # encoder for the digit labels

# load the data set
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# flatten each 28x28 image into a length-784 vector and scale pixels to [0, 1]
train_X = train_X.reshape(-1, 784) / 255

# one-hot encode the labels (shape: n_samples x 10)
train_y_oh = oh.fit_transform(train_y.reshape(-1, 1)).toarray()
Als Aktivierungsfunktion wird die Sigmoidfunktion benutzt: $S(x)=\frac{1}{1+e^{-x}}$
hidden_0 = 50   # number of nodes in the 1st hidden layer
hidden_1 = 500  # number of nodes in the 2nd hidden layer


def costs(x, y, w_a, w_b, seed_, n_hidden_0=None, n_hidden_1=None):
    """Return the MSE loss of a fixed random network as a function of w_a, w_b.

    All weights are drawn from a fixed random seed; only the two monitored
    output-layer weights w2[5][250] and w2[5][251] are overwritten with
    w_a and w_b, so the loss depends on (w_a, w_b) alone.

    Parameters
    ----------
    x : array, shape (n_samples, 784)
        Flattened, normalized input images.
    y : array, shape (10, n_samples)
        One-hot targets, already transposed by the caller.
    w_a, w_b : float (or size-1 array)
        Values for the two monitored weights.
    seed_ : int
        Random seed fixing all other weights.
    n_hidden_0, n_hidden_1 : int, optional
        Hidden-layer sizes; default to the module-level constants
        hidden_0 / hidden_1.  n_hidden_1 must be > 251 so that the
        monitored weight indices exist.

    Returns
    -------
    float
        Mean over samples of the summed squared output error.
    """
    if n_hidden_0 is None:
        n_hidden_0 = hidden_0
    if n_hidden_1 is None:
        n_hidden_1 = hidden_1
    np.random.seed(seed_)  # reproducible weights
    w0 = np.random.randn(n_hidden_0, 784)         # weight matrix of 1st hidden layer
    w1 = np.random.randn(n_hidden_1, n_hidden_0)  # weight matrix of 2nd hidden layer
    w2 = np.random.randn(10, n_hidden_1)          # weight matrix of output layer
    w2[5][250] = w_a  # set value for monitored weight w_250,5(2)
    w2[5][251] = w_b  # set value for monitored weight w_251,5(2)
    a0 = expit(w0 @ x.T)   # output of 1st hidden layer
    a1 = expit(w1 @ a0)    # output of 2nd hidden layer
    pred = expit(w2 @ a1)  # output of final layer
    return np.mean(np.sum((y - pred) ** 2, axis=0))  # cost w.r.t. w_a and w_b
# Meshgrid:
# Build the grid over the two monitored weights:
m1s = np.linspace(-15, 17, 40)
m2s = np.linspace(-15, 18, 40)
M1, M2 = np.meshgrid(m1s, m2s)  # create the meshgrid


def _surface(n_samples):
    # Evaluate the cost at every grid point using the first n_samples
    # training examples; returns a flat array in grid-raveled order.
    x, y = train_X[0:n_samples], train_y_oh[0:n_samples].T
    vals = [costs(x, y, np.array([[a]]), np.array([[b]]), 135)
            for a, b in zip(np.ravel(M1), np.ravel(M2))]
    return np.array(vals)


zs_100 = _surface(100)
Z_100 = zs_100.reshape(M1.shape)    # z-values for N = 100
zs_1000 = _surface(1000)
Z_1000 = zs_1000.reshape(M1.shape)  # z-values for N = 1,000
# N = 10,000 is possible too but takes considerably longer
Plot der Loss-Landschaft für eine unterschiedliche Anzahl von Samples:
# Interactive side-by-side plot of the two loss landscapes with linked cameras.
def cam_change(layout, camera):
    # Callback fired when the camera of scene1 changes: copy it onto scene2
    # so both subplots rotate in sync.  Uses the module-level fig_widget
    # created below (it exists before the callback can ever fire).
    fig_widget.layout.scene2.camera = camera

# surface traces for the two sample sizes
fig_100 = dict(type='surface',x=M1,y=M2,z=Z_100,scene='scene1')
fig_1000 = dict(type='surface',x=M1,y=M2,z=Z_1000,scene='scene2')
# 1x2 grid of 3D surface subplots
fig = make_subplots(rows=1, cols=2,specs=[[{"type": "surface"},{"type": "surface"}]],
                    subplot_titles=("N:100","N:1000"))
fig.append_trace(fig_100, row=1, col=1)
fig.append_trace(fig_1000, row=1, col=2)
# show z-contours projected onto the surfaces
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True))
fig_widget = go.FigureWidget(fig)  # widget wrapper so camera changes are observable
fig_widget.layout.scene1.camera.eye=dict(x=2, y=-2, z=2)
fig_widget.layout.scene2.camera.eye=dict(x=2, y=-2, z=2)
fig_widget.layout.scene1.on_change(cam_change, 'camera')  # link left camera to right
fig_widget.show()
Lassen Sie uns das Netzwerk trainieren — allerdings aktualisieren wir nur die Gewichte, die wir überwachen.
# Liste für Ausgabe später
# Names of the optimizers being compared (also used as keys into mcolors):
methods = ['SGD', 'SGD-Momentum', 'Nesterov-SGD', 'RMSProp', 'Adam']

# Per-optimizer colors: [trajectory line color, current-position marker color]
mcolors = {
    'SGD': ['red', 'darkred'],
    'SGD-Momentum': ['green', 'darkgreen'],
    'Nesterov-SGD': ['blue', 'darkblue'],
    'RMSProp': ['yellow', 'yellow'],
    'Adam': ['pink', 'pink'],
}

# Containers recording the two monitored weights and the cost per step:
weights_2_5_250 = []
weights_2_5_251 = []
costs = []  # NOTE: rebinding shadows the costs() function defined earlier

seed_ = 135    # random seed
N = 100        # sample size
pflag = False  # progress-printing flag
# Set up neurale Netzwerk:
# Set up the neural network:
class NeuralNetwork(object):
    """Fixed random MNIST network in which ONLY the two monitored
    output-layer weights w2[5][250] and w2[5][251] are ever trained.

    Reads the module-level globals ``seed_``, ``hidden_0``, ``hidden_1``,
    ``start_a`` and ``start_b`` at construction time, and appends the
    per-step cost to the module-level list ``costs``.
    """

    def __init__(self, lr=0.01):
        """Initialize weights from the fixed seed and all optimizer state.

        Parameters
        ----------
        lr : float
            Learning rate used by every optimizer variant.
        """
        self.lr = lr
        np.random.seed(seed_)  # reproducible initial weights
        # Initialize weight matrices:
        self.w0 = np.random.randn(hidden_0, 784)
        self.w1 = np.random.randn(hidden_1, hidden_0)
        self.w2 = np.random.randn(10, hidden_1)
        self.w2[5][250] = start_a  # starting value for monitored weight w_a
        self.w2[5][251] = start_b  # starting value for monitored weight w_b
        # State for Momentum, Nesterov and RMSProp.
        self.rho = 0.9
        # BUGFIX: these were hard-coded to shape (10, 500); they must match
        # the gradient shape (10, hidden_1) or the momentum/RMSProp updates
        # break whenever hidden_1 != 500.
        self.vx = np.zeros((10, hidden_1))
        self.grad_squared = np.zeros((10, hidden_1))
        # State for Adam:
        self.counter = 0        # Adam step counter (for bias correction)
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.first_moment = 0
        self.second_moment = 0
        self.first_unbias = 0
        self.secnd_unbias = 0

    def evaluate_gradient(self, a1, X, y, pred):
        """Gradient of the MSE cost w.r.t. the output-layer weights w2.

        Returns an array of shape (10, hidden_1).
        """
        return (pred - y.T) * pred * (1 - pred) @ a1.T / len(X)

    def SGD_train(self, X, y, method):
        """Run one training step with the optimizer named by ``method``.

        Only the two monitored weights are updated; the cost of the current
        prediction is appended to the global ``costs`` list.

        Parameters
        ----------
        X : array, shape (n_samples, 784)
        y : array, shape (n_samples, 10) -- one-hot targets (NOT transposed)
        method : str -- one of 'SGD', 'SGD-Momentum', 'Nesterov-SGD',
            'RMSProp', 'Adam'; any other value is a silent no-op.
        """
        # Forward pass:
        a0 = expit(self.w0 @ X.T)
        a1 = expit(self.w1 @ a0)
        pred = expit(self.w2 @ a1)
        # Partial derivatives of the cost w.r.t. the output-layer weights:
        dw2 = self.evaluate_gradient(a1, X, y, pred)
        # Update ONLY our monitored weights:
        if method == 'SGD':
            self.w2[5][250] = self.w2[5][250] - self.lr * dw2[5][250]
            self.w2[5][251] = self.w2[5][251] - self.lr * dw2[5][251]
            costs.append(self.cost(pred, y))  # record cost of this step
        if method == 'SGD-Momentum':
            self.vx = self.rho * self.vx + dw2
            self.w2[5][250] = self.w2[5][250] - self.lr * self.vx[5][250]
            self.w2[5][251] = self.w2[5][251] - self.lr * self.vx[5][251]
            costs.append(self.cost(pred, y))
        if method == 'Nesterov-SGD':
            old_vx = self.vx
            self.vx = self.rho * self.vx - self.lr * dw2
            # Nesterov look-ahead update expressed via the velocity change:
            self.w2[5][250] = self.w2[5][250] + self.vx[5][250] + self.rho * (self.vx[5][250] - old_vx[5][250])
            self.w2[5][251] = self.w2[5][251] + self.vx[5][251] + self.rho * (self.vx[5][251] - old_vx[5][251])
            costs.append(self.cost(pred, y))
        if method == 'RMSProp':
            self.grad_squared = self.rho * self.grad_squared + (1 - self.rho) * dw2 * dw2
            # 1e-1 is an unusually large epsilon; kept for reproducibility
            self.w2[5][250] = self.w2[5][250] - self.lr * dw2[5][250] / (np.sqrt(self.grad_squared[5][250]) + 1e-1)
            self.w2[5][251] = self.w2[5][251] - self.lr * dw2[5][251] / (np.sqrt(self.grad_squared[5][251]) + 1e-1)
            costs.append(self.cost(pred, y))
        if method == 'Adam':
            self.first_moment = self.beta1 * self.first_moment + (1 - self.beta1) * dw2
            self.second_moment = self.beta2 * self.second_moment + (1 - self.beta2) * dw2 * dw2
            # Bias-corrected moment estimates:
            self.first_unbias = self.first_moment / (1 - self.beta1 ** (self.counter + 1))
            self.second_unbias = self.second_moment / (1 - self.beta2 ** (self.counter + 1))
            self.counter += 1
            self.w2[5][250] = self.w2[5][250] - self.lr * self.first_unbias[5][250] / (np.sqrt(self.second_unbias[5][250]) + 1e-1)
            self.w2[5][251] = self.w2[5][251] - self.lr * self.first_unbias[5][251] / (np.sqrt(self.second_unbias[5][251]) + 1e-1)
            costs.append(self.cost(pred, y))

    def cost(self, pred, y):
        """Mean over samples of the summed squared output error."""
        return np.mean(np.sum((y.T - pred) ** 2, axis=0))
# Anfangswerte für w_a/w_b:
# Starting values for w_a/w_b:
starting_points = [ (-9,15)]# ,(-10.1,15),(-11,15)]  alternative starts, disabled
epochs = 1000 # number of training steps per method
# Train every optimizer from the same starting point and record the
# trajectory of the two monitored weights plus the cost of each step.
for method in methods:
    print('Method: ', method)
    start_a,start_b=starting_points[0]  # read by NeuralNetwork.__init__
    model=NeuralNetwork(10) # learning rate 10: deliberately large so differences show
    for i in range(epochs):
        model.SGD_train(train_X[0:N], train_y_oh[0:N], method)
        weights_2_5_250.append(model.w2[5][250]) # record monitored weight w_a
        weights_2_5_251.append(model.w2[5][251]) # record monitored weight w_b
        if i % (int(epochs/20)) == 0 and pflag == True:
            print("Epoch ", i, " of ", epochs, ".")
print("Fertig!")
# Split costs and weights into one sub-array per algorithm.
# NOTE: this rebinds `costs` from a flat list to a list of arrays.
costs = np.split(np.array(costs),len(methods))
weights_2_5_250 = np.split(np.array(weights_2_5_250),len(methods))
weights_2_5_251 = np.split(np.array(weights_2_5_251),len(methods))
# Which epochs to plot:
lower_bound = int(epochs/500)  # NOTE(review): equals 2 here, so p1 is just [0] — confirm intended
p1=list(np.arange(0,lower_bound,20))# non-uniform spacing (unused, see points_)
p2=list(np.arange(lower_bound,epochs,100))
p3=list(np.arange(0,epochs,20)) # uniform spacing
#points_=p1+p2
points_=p3  # epochs actually animated
counter = 0  # NOTE(review): appears unused below — confirm before removing
scaler=1.001  # small lift applied to z so trajectories sit above the surface
# Plotly Figur
# Skeleton of the animated Plotly figure (data, layout, animation frames):
fig_dict = {
    "data": [],
    "layout": {},
    "frames": []
}
fig_dict["data"].append(fig_100) # trace 0: the static loss surface
# Two identical line traces per method; the frames below redraw traces by
# index, so the duplicate keeps a stable legend entry during the animation.
for i in range(len(methods)):
    # add each graph 2 times
    fig_dict["data"].append(go.Scatter3d(x=weights_2_5_250[i][0:epochs],
                                         y=weights_2_5_251[i][0:epochs],
                                         z=costs[i][0:epochs]*scaler, #TODO add some value to emphasize
                                         line=dict(color=mcolors[methods[i]][0],width=5),
                                         name=methods[i],mode='lines'))
    fig_dict["data"].append(go.Scatter3d(x=weights_2_5_250[i][0:epochs],
                                         y=weights_2_5_251[i][0:epochs],
                                         z=costs[i][0:epochs]*scaler, #TODO add some value to emphasize
                                         line=dict(color=mcolors[methods[i]][0],width=5),
                                         name=methods[i], mode='lines',showlegend=True))
# Play/Pause buttons driving the frame animation:
fig_dict["layout"]["updatemenus"] = [
    {
        "buttons": [
            {
                "args": [None, {"frame": {"duration": 500, "redraw": True},
                                "fromcurrent": True, "transition": {"duration": 300,
                                "easing": "quadratic-in-out"}}],
                "label": "Play",
                "method": "animate"
            },
            {
                # [None] (as opposed to None) halts the animation immediately
                "args": [[None], {"frame": {"duration": 0, "redraw": True},
                                  "mode": "immediate",
                                  "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }
]
# Slider scrubbing through the plotted epochs; "steps" is filled in by the
# frame-building loop below:
sliders_dict = {
    "active": 0,
    "yanchor": "top",
    "xanchor": "left",
    "currentvalue": {
        "font": {"size": 20},
        "prefix": "Epoch:",
        "visible": True,
        "xanchor": "right"
    },
    "transition": {"duration": 300, "easing": "cubic-in-out"},
    "pad": {"b": 10, "t": 50},
    "len": 0.9,
    "x": 0.1,
    "y": 0,
    "steps": []
}
# NOTE(review): data_dict appears unused below — confirm before removing.
data_dict = {
    "x": M1,
    "y": M2,
    "z": Z_100,
    "type": "surface",
}
# Build one animation frame (plus matching slider step) per plotted epoch:
for i in range(len(points_)):
    epoch = points_[i]
    fdata=[]
    # frame trace 0: redraw the loss surface
    fdata3=dict(type='surface',x=M1,y=M2,z=Z_100,name='Loss Landschaft')
    fdata.append(fdata3)
    for j in range(len(methods)):
        # current-position marker of this optimizer at `epoch`
        fdata1=go.Scatter3d(
            x=[weights_2_5_250[j][epoch]],
            y=[weights_2_5_251[j][epoch]],
            z=[costs[j][epoch]*scaler],
            name=methods[j] +' trace',
            mode="markers", marker=dict(color=mcolors[methods[j]][1], size=10))
        # trajectory of this optimizer up to `epoch`
        fdata2=go.Scatter3d(x=weights_2_5_250[j][0:epoch],
                            y=weights_2_5_251[j][0:epoch],
                            z=costs[j][0:epoch]*scaler,
                            line=dict(color=mcolors[methods[j]][0],width=5),
                            name=methods[j],mode='lines')
        fdata.append(fdata1)
        fdata.append(fdata2)
    # Indices of the base traces this frame replaces.
    # NOTE(review): fdata holds 1 + 2*len(methods) traces while tlist lists
    # 1 + 3*len(methods) indices — confirm the intended trace mapping.
    tlist=list(range(1+3*len(methods)))
    frame = go.Frame(data=fdata, traces=tlist,name= str(epoch))
    fig_dict["frames"].append(frame)
    # slider step that jumps straight to this frame (matched by name)
    slider_step = {"args": [
        [str(epoch)],
        {"frame": {"duration": 300, "redraw": True},
         "mode": "immediate",
         "transition": {"duration": 1}}
    ],
        "label": str(epoch),
        "method": "animate"}
    sliders_dict["steps"].append(slider_step)
fig_dict["layout"]["sliders"] = [sliders_dict]
fig = go.Figure(fig_dict)
# contour projection on the surface trace only
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True),selector=dict(type='surface'))
fig.update_layout(title='Loss landschaft und Trajektorien', #autosize=False,
                  scene_camera_eye=dict(x=2, y=-2, z=2),
                  width=800, height=800,
                  margin=dict(l=15, r=20, b=15, t=60),
                  legend=dict(
                      yanchor="top",
                      y=0.99,
                      xanchor="left",
                      x=0.01
                  )
)
# Fix the axis ranges to the meshgrid extents so frames don't rescale:
fig.layout.scene.xaxis.range = [-15, 17]
fig.layout.scene.yaxis.range = [-15, 18]
#fig.layout.scene.zaxis.range = [-10, 10]
fig.show()